This is my data vizualisation portfolio.
I gathered code I wrote to create some nice plots, put them in an Rmarkdown to create an HTML page where I would have an easy and centralized access to all of those code lines. The goal is to preview the graphs so I can identify the features I want to re-use while creating a new plot, and to be able to copy and paste the chunks I want.
The .Rmd file can run by itself, all the data is either simulated or arbitrarily defined.
Below is the list of all the required packages. Mostly graphical tools, along with some statistical analysis ressources.
# Tidy data
library(tidyverse)
library(lubridate)
# Supplementary analysis
library(survival)
library(TraMineR)
library(rstatix)
library(epiR)
# Data
library(ISLR)
# plotly
library(plotly)
# ggplot2 and addins
library(ggplot2)
library(ggfortify)
library(ggpubr)
library(gridExtra)
library(ggseqplot)
library(ggh4x)
library(ggstatsplot)
# Other graphics tools
library(treemap)
Starting with the basics : histograms, barplots, pie charts…
Basic is good, but make it pretty.
An horizontal and stacked barplot :
# Simulate data for a barplot
dataBarplot <- data.frame(
Episode = rep(c("First Covid-19 episode", "Last Covid-19 episode"), each = 63), # Two Covid episodes
Time = factor(rep(rep(c("Acute phase", "3 months or more", "Still persisting"), each = 21), 2),
levels = c("Acute phase", "3 months or more", "Still persisting")), # Symptoms last for a given time
Symptoms = rep(c("Abdominal pain", "Anxiety", "Chest pain", "Cognitive disfunction", "Cough", # List of possible symptoms
"Depression", "Dizziness", "Fatigue", "Fever", "Gastrointestinal problem",
"Headache", "Joint pain", "Loss of smell or taste", "Menstruation issue", "Neuralgia",
"New allergies", "Shortness of breath", "Sleeping disorder", "Tachycardia", "Tinnitus",
"Troubled vision"), 6),
N = c(0, 0, 0, 0, 2, 5, 9, 285, 404, 714, 646, 870, 765, 1873, 2632, 2106, 3181, 3844, # Number of cases
3985, 4921, 3529, 0, 0, 0, 0, 0, 0, 3, 38, 51, 49, 82, 116, 119, 287, 142, 448,
190, 336, 158, 81, 502, 0, 1, 4, 6, 12, 12, 6, 315, 243, 192, 232, 212, 930, 394,
394, 902, 781, 351, 460, 92, 1318, 0, 0, 0, 0, 0, 0, 0, 71, 62, 119, 148, 162,
217, 123, 427, 347, 520, 669, 825, 730, 574, 0, 0, 0, 0, 1, 0, 0, 3, 7, 13, 7, 10,
22, 8, 13, 40, 15, 34, 18, 17, 36, 0, 1, 1, 2, 1, 2, 2, 61, 67, 59, 51, 58, 85,
217, 84, 200, 172, 79, 18, 115, 296)) %>%
# Total number of cases per symptoms and episode
mutate(Ntot = c(rep(by(N[Episode == "First Covid-19 episode"], Symptoms[Episode == "First Covid-19 episode"], sum), 3),
rep(by(N[Episode == "Last Covid-19 episode"], Symptoms[Episode == "Last Covid-19 episode"], sum), 3)))
# Initialize the plot with the dataset (DATA_ex2_1)
ggplot(dataBarplot) +
# Add a stacked bar chart (geom_col creates bars with heights determined by 'N')
geom_col(
mapping = aes(N, reorder(Symptoms, -N), fill = Time), # Map 'N' to x, 'Symptome' to y, and 'Temps' to fill color
width = 0.6, # Set the bar width
position = "stack" # Stack bars by the 'fill' variable
) +
# Create facets (separate panels) for each level of the 'Episode' variable
facet_grid(cols = vars(Episode)) +
# Add a vertical line at x = 0 for reference
geom_vline(aes(xintercept = 0)) +
# Customize the x-axis scale
scale_x_continuous(
limits = c(0, 6500), # Set the range for the x-axis
breaks = seq(0, 6000, 1000), # Major tick marks every 1000 units
minor_breaks = seq(500, 6500, 500), # Minor tick marks every 500 units
expand = c(0, 0) # Remove padding on the x-axis
) +
scale_fill_manual(
name = "Symptoms duration",
values = c('#28AFB0', '#F6803C', '#82354F')
) +
# Add text labels to display 'Ntot' values next to each bar
geom_text(
mapping = aes(Ntot + 25, y = Symptoms, label = Ntot), # Position and content of the text
hjust = 0, # Align text to the left of its position
nudge_x = 1 # Slightly nudge text horizontally
) +
# Add titles and subtitles (empty here for now)
labs(title = "", subtitle = "") +
# Customize the overall appearance of the plot
theme(
axis.title = element_blank(), # Remove axis titles
panel.background = element_rect(fill = "white"), # Set panel background to white
panel.grid.major.x = element_line(color = "darkgray"), # Major grid lines for x-axis
panel.grid.minor.x = element_line(colour = "lightgray"), # Minor grid lines for x-axis
legend.position = "bottom"
)
This one is vertical and dodged :
# Simulate data for a barplot
dataBarplot <- data.frame(
Time = factor(rep(c("Acute phase", "3 months or more", "Still persisting"), each = 14),
levels = c("Acute phase", "3 months or more", "Still persisting")), # Symptoms last for a given time
Symptoms = rep(c("Fatigue", "Loss of smell or taste", "Menstruation issue", "Neuralgia", "Fever", "Gastrointestinal problem",
"Headache", "Sleeping disorder", "Tachycardia", "Joint pain",
"New allergies", "Shortness of breath", "Tinnitus",
"Troubled vision"), 3),
N = c(285, 404, 714, 646, 870, 765, 1873, 2632, 2106, 3181, 3844, # Number of cases
3985, 4921, 3529, 38, 51, 49, 82, 116, 119, 287, 142, 448,
190, 336, 158, 81, 502, 315, 243, 192, 232, 212, 930, 394,
394, 902, 781, 351, 460, 92, 1318))
# Initialize the plot with the dataset (DATA_ex2_1)
ggplot(dataBarplot) +
# Add a stacked bar chart (geom_col creates bars with heights determined by 'N')
geom_col(
mapping = aes(reorder(Symptoms, -N), N, fill = Time), # Map 'N' to x, 'Symptome' to y, and 'Temps' to fill color
width = 0.9, # Set the bar width
position = "dodge" # Stack bars by the 'fill' variable
) +
# Add a vertical line at x = 0 for reference
geom_hline(aes(yintercept = 0)) +
# Customize the x-axis scale
scale_y_continuous(
limits = c(0, 5500), # Set the range for the x-axis
breaks = seq(0, 5000, 1000), # Major tick marks every 1000 units
minor_breaks = seq(500, 5500, 500), # Minor tick marks every 500 units
expand = c(0, 0) # Remove padding on the x-axis
) +
scale_fill_manual(
name = "Symptoms duration",
values = c('#28AFB0', '#F6803C', '#82354F')
) +
# Add text labels to display 'Ntot' values next to each bar
geom_text(
mapping = aes(x = Symptoms, y = N, label = N), # Position and content of the text
nudge_x = rep(c(-0.3, 0, 0.3), each = 14),
nudge_y = 120,
size = 2
) +
# Add titles and subtitles (empty here for now)
labs(title = "", subtitle = "") +
# Customize the overall appearance of the plot
theme(
axis.title = element_blank(), # Remove axis titles
panel.background = element_rect(fill = "white"), # Set panel background to white
panel.grid.major.y = element_line(color = "darkgray"), # Major grid lines for x-axis
panel.grid.minor.y = element_line(colour = "lightgray"), # Minor grid lines for x-axis
legend.position = "bottom",
axis.text.x = element_text(
size = 10, # Font size
angle = 45, # Rotate text 45 degrees
vjust = 1, # Vertical alignment
hjust = 1 # Horizontal alignment
)
)
I like to display percentages as stacked barplots, to make it obvious that values sum up to a hundred.
This one has and grid according to the status, and I feel like it is easier to read the proportions differences whenever the bars are horizontal :
# Simulate data for a bar plot
dataBarplot <- data.frame(
# Create a "Status" column with repeated categories (12 repetitions for each category)
Status = rep(c("Family member", "Active worker", "Retired worker"), each = 12),
# Create an "Affection" column, categorizing Covid-19 status, with ordered factor levels
Affection = factor(
rep(rep(c("No Covid-19", "Covid-19", "Long Covid-19"), each = 4), 3),
levels = c("No Covid-19", "Covid-19", "Long Covid-19")
),
# Create a "Social_satisfaction" column, representing satisfaction levels, with ordered factor levels
Social_satisfaction = factor(
rep(c("Very satisfied", "Satisfied", "Unsatisfied", "Very unsatisfied"), 9),
levels = c("Very satisfied", "Satisfied", "Unsatisfied", "Very unsatisfied")
),
# Define a numeric vector "N" representing counts corresponding to the combinations above
N = c(65, 255, 70, 16, 38, 131, 41, 7, 21, 123, 59, 10,
819, 2532, 591, 130, 640, 1723, 357, 38, 235, 1134, 468, 68,
1444, 4212, 677, 128, 528, 1198, 164, 20, 160, 575, 166, 27)
) %>%
# Add new columns using dplyr's mutate function
mutate(
# Compute the total count (Ntot) for each "Affection" group within each "Status" group
Ntot = c(
# For "Family member", calculate totals per "Affection" group
rep(by(N[Status == "Family member"], Affection[Status == "Family member"], sum), each = 4),
# For "Active worker", calculate totals per "Affection" group
rep(by(N[Status == "Active worker"], Affection[Status == "Active worker"], sum), each = 4),
# For "Retired worker", calculate totals per "Affection" group
rep(by(N[Status == "Retired worker"], Affection[Status == "Retired worker"], sum), each = 4)
),
# Calculate percentages (P) for each count (N) relative to the total count (Ntot) per group
P = N / Ntot * 100
)
# Initialize the plot with the dataset (DATA_ex2_2)
ggplot(dataBarplot) +
# Add a stacked bar chart
geom_col(
mapping = aes(P, Affection, fill = Social_satisfaction), # Map 'P' to x, 'COVID_long_3mod' to y, and 'Q_E1' to fill
width = 0.6, # Set the bar width
position = "stack", # Stack bars by the 'fill' variable
color = "white" # Add white border to bars
) +
# Create facets (separate rows) for each level of the Status variable
facet_grid(rows = vars(Status)) +
# Add vertical lines at x = 0 and x = 100 for reference
geom_vline(aes(xintercept = 0)) +
geom_vline(aes(xintercept = 100)) +
# Customize the x-axis scale
scale_x_continuous(
limits = c(0, 100.2), # Set the range for the x-axis
breaks = seq(0, 100, 10), # Major tick marks every 10%
minor_breaks = seq(5, 95, 10), # Minor tick marks every 5%
expand = c(0, 0), # Remove padding on the x-axis
labels = paste0(seq(0, 100, 10), "%") # Append "%" to the tick labels
) +
# Add titles and subtitles (empty here for now)
labs(title = " ", subtitle = "") +
# Customize the overall appearance of the plot
theme(
axis.title = element_blank(), # Remove axis titles
# axis.text.x = element_blank(), # Uncomment to hide x-axis labels
# axis.text.y = element_blank(), # Uncomment to hide y-axis labels
panel.background = element_rect(fill = "white"), # Set panel background to white
panel.grid.major.x = element_line(color = "darkgray"), # Major grid lines for x-axis
panel.grid.minor.x = element_line(colour = "lightgray") # Minor grid lines for x-axis
) +
# Customize the fill legend for the bar chart
scale_fill_manual(
"Social interactions", # Title for the legend
values = rev(RColorBrewer::brewer.pal(4, "RdYlGn")), # Reverse color palette
labels = c( # Define labels for legend items
"Very satisfied", # Very satisfactory
"Satisfied", # Somewhat satisfactory
"Unsatisfied", # Somewhat unsatisfactory
"Very unsatisfied" # Very unsatisfactory
)
)
Adding some percentages labels and handling long variables names :
# Create the data frame for the bar plot
dataBarplot <- data.frame(
# "LTI" column with long-term illness categories repeated for each age group
LTI = factor(rep(
c("Leukaemia", "Diabetes (insipidus & mellitus)", "Epilepsy", "Multiple Sclerosis",
"Parkinson & affiliated", "Mental Illness"), 5),
levels = c("Leukaemia", "Diabetes (insipidus & mellitus)", "Epilepsy", "Multiple Sclerosis",
"Parkinson & affiliated", "Mental Illness")
),
# "Age" column with age group categories repeated for each illness type
Age = factor(rep(
c("Under 18", "18 to 24 years old", "25 to 44 years old", "45 to 59 years old", "Over 60"), each = 6),
levels = c("Under 18", "18 to 24 years old", "25 to 44 years old", "45 to 59 years old", "Over 60")
),
# Randomly generated values for the "N" column representing counts
N = sample(100:300, 30)
) %>%
# Calculate the percentage contribution of each count (N) within each illness type (LTI)
mutate(
percent = as.numeric(unlist(by(N, LTI, function(x) {x / sum(x) * 100})))[rep(seq(1, 26, 5), 5) + rep(0:4, each = 6)],
# Label for percentages, shown only if the value exceeds 2%
lab = ifelse(percent > 2, paste0(formatC(percent, 1, format = "f"), "%"), "")
)
# Initialize a vector for positioning text labels within each bar
x_text <- rep(NA, nrow(dataBarplot))
# Calculate the cumulative y-position for placing the text labels
for (i in unique(dataBarplot$LTI)) {
for (j in 1:sum(dataBarplot$LTI == i)) {
ifelse(j == 1,
x_text[which(dataBarplot$LTI == i)[j]] <- dataBarplot$percent[dataBarplot$LTI == i][j] / 2,
x_text[which(dataBarplot$LTI == i)[j]] <- sum(dataBarplot$percent[dataBarplot$LTI == i][1:(j-1)]) +
dataBarplot$percent[dataBarplot$LTI == i][j] / 2)
}
}
dataBarplot$x_text <- 100 - x_text # Adjust text position to align with the plot
# Generate the bar plot
ggplot(data = dataBarplot) +
# Create stacked bars for proportions by age group
geom_col(
mapping = aes(x = LTI, y = percent, fill = Age),
width = 0.6, # Bar width
position = "stack", # Stacked bar position
color = "white" # Bar border color
) +
# Add percentage labels to each segment of the bar
geom_text(
aes(x = LTI, y = x_text, label = lab),
size = 2, # Font size
colour = "white" # Text color
) +
# Add titles and axis labels
labs(
title = "Long term illnesses per age group", # Title
ylab = "Proportion", # Y-axis label
xlab = "" # X-axis label (empty)
) +
# Customize the y-axis scale
scale_y_continuous(
breaks = seq(0, 100, 25), # Tick intervals
labels = c("0%", "25%", "50%", "75%", "100%") # Custom tick labels
) +
# Customize the fill colors for the "Age" categories
scale_fill_manual(
values = c('#1E5471', '#28AFB0', '#F6803C', '#82354F', '#E0D6FF'),
name = "Classe ATC1" # Legend title
) +
# Apply a minimal theme to the plot
theme_minimal() +
# Customize the legend and x-axis text
theme(
legend.position = "right", # Position the legend on the right
axis.text.x = element_text(
size = 10, # Font size
angle = 45, # Rotate text 45 degrees
vjust = 1, # Vertical alignment
hjust = 1 # Horizontal alignment
)
)
A very basic one. But adding a second axis with error-bars representing the incidence rate CI95% :
ymax <- 200
coeff <- 70/200
dataBarplot <- data.frame(
year = 2012:2022,
n_cases = c(120, 87, 78, 96, 117, 153, 139, 164, 181, 188, 163),
pop = rep(300000, 11)
) %>%
mutate(
IR = n_cases / pop * 100000,
IR_lower = epi.conf(cbind(n_cases, pop), ctype = "inc.rate", method = "exact")[, 2] * 100000,
IR_upper = epi.conf(cbind(n_cases, pop), ctype = "inc.rate", method = "exact")[, 3] * 100000
)
ggplot(dataBarplot,
aes(
x = year,
y = n_cases,
width = 0.95
)) +
geom_col(
mapping = aes(fill = "Incidence"),
fill = "#AAC0AF" # couleur des barres
) +
geom_line(
aes(
x = year,
y = IR,
colour = "Incidence rate" # Nmo dans la legende
),
linetype = 1,
size = 1,
colour = "black"
) +
geom_errorbar(
aes(
x = year,
ymin = IR_lower,
ymax = IR_upper,
width = 0.2
)
) +
geom_text(
aes(x = year,
y = n_cases,
label = n_cases),
nudge_y = 7,
size = 6
) +
# Separation de l'axe y
scale_y_continuous(
expand = c(0,0),
limits = c(0, ymax),
breaks = seq(0, ymax, 25),
name = "Incidence",
sec.axis = sec_axis(
trans = ~.*coeff,
name = "Incidence rate",
breaks = seq(0, ymax*coeff, 10)
)
) +
scale_x_continuous(
breaks = 2012:2022,
labels = 2012:2022,
name = ""
) +
theme_classic() +
theme(
panel.grid.major.y = element_line(colour = "grey"),
panel.grid.minor.y = element_line(colour = "lightgray"),
axis.text.x = element_text(size = 11),
axis.line.y.left = element_line(color = "#AAC0AF"),
axis.title.y.left = element_text(color = "#AAC0AF"),
axis.text.y.left = element_text(color = "#AAC0AF")
) + # Titres
labs(
title = "",
fill = "",
x = " ",
y = " "
)
I don’t really enjoy boxplots as I feel they get quite boring. But you can add some twists to make it interesting !
Basic setup :
# Create a data frame for boxplot visualization
dataBoxplot <- data.frame(
med_id = rep(1:8, c(8, 3, 3, 5, 7, 6, 4, 11)), # Assign medical operator IDs, repeated based on surgery counts
op_id = c(1:8, 1:3, 1:3, 1:5, 1:7, 1:6, 1:4, 1:11), # Assign operation IDs for each surgery
setup = pmax(10 - rpois(47, lambda = 4), 1) # Simulate 'setup' scores, capped at 10 and minimum of 1
)
# Create a data frame for the line plot (average setup values per operation ID)
dataLine <- data.frame(
op_id = 1:max(dataBoxplot$op_id), # Sequence of operation IDs
setup = as.numeric(by(dataBoxplot$setup, dataBoxplot$op_id, mean)) # Compute mean 'setup' scores by op_id
)
# Generate the plot using ggplot2
ggplot(dataBoxplot) +
# Add a boxplot layer for setup scores grouped by operation ID
geom_boxplot(
aes(group = op_id, x = op_id, y = setup) # Map operation ID and setup scores to boxplot
) +
# Add a red line connecting the mean setup values for each operation ID
geom_line(
data = dataLine, # Use the data frame with mean setup values
aes(op_id, setup), # Map operation ID to x and mean setup to y
color = "red" # Set line color to red
) +
# Add red points for the mean setup values
geom_point(
data = dataLine, # Use the data frame with mean setup values
aes(op_id, setup), # Map operation ID to x and mean setup to y
color = "red", # Set point color to red
shape = 16 # Use solid circle for points
) +
annotate(
"text",
x = 1:max(dataBoxplot$op_id), y = 10,
label = paste0("(n=", table(dataBoxplot$op_id),")")
) +
# Add a subtitle with the p-value from a linear model (setup ~ op_id)
labs(subtitle = paste0("p=", round(summary(lm(setup ~ op_id, dataBoxplot))$coefficient[2, 4], 4))) +
# Label the y-axis
ylab("Surgery setup grade (out of 10)") +
# Label the x-axis
xlab("Number of surgeries") +
# Customize the y-axis scale
scale_y_continuous(
limits = c(0, 10.5), # Set y-axis limits between 0 and 10
breaks = 0:10, # Add breaks at each integer value
expand = c(0, 0) # Remove padding on the axis
) +
# Customize the x-axis scale
scale_x_discrete(
limits = 1:max(dataBoxplot$op_id), # Limit x-axis to the range of operation IDs
breaks = 1:max(dataBoxplot$op_id) # Add breaks at each operation ID
) +
# Apply a minimal theme for clean visualization
theme_minimal()
Adding some two-by-two testing :
# Simulate data for a boxplot
dataBoxplot <- data.frame(
id = rep(1:47, 5), # 47 subjects, repeated 5 times for each group
times = rep(paste0("Month ", c(0, 3, 6, 9, 12)), each = 47), # 5 time points
val = c( # Simulate values for each time point with different means and SDs
rnorm(n = 47, mean = 2, sd = 1), # Group 1 (mean=2, sd=1)
rnorm(n = 47, mean = 5, sd = 1), # Group 2 (mean=5, sd=1)
rnorm(n = 47, mean = 5, sd = 2), # Group 3 (mean=5, sd=2)
rnorm(n = 47, mean = 8, sd = 2), # Group 4 (mean=8, sd=2)
rnorm(n = 47, mean = 8, sd = 5) # Group 5 (mean=8, sd=5)
)
)
dataBoxplot$val <- pmax(0, dataBoxplot$val)
# Perform ANOVA to test differences between the groups
res.aov <- anova_test(
data = dataBoxplot,
dv = val, # Dependent variable: val (size)
wid = id, # Within-subjects factor: id
between = times # Between-subjects factor: times
)
# Pairwise t-tests to compare each group with group 1, adjusting for multiple comparisons using Bonferroni correction
pwc <- dataBoxplot %>%
pairwise_t_test(
val ~ times, # Dependent variable 'val' across levels of 'times'
paired = TRUE, # Paired samples
ref.group = "Month 0", # Use "n°1" as the reference group
p.adjust.method = "bonferroni" # Apply Bonferroni correction to p-values
) %>%
add_xy_position(x = "times") # Add xy-position for displaying p-values
# Create the boxplot with individual data points and p-values
ggboxplot(
dataBoxplot, # Data for the boxplot
x = "times", # Group by 'times'
y = "val", # Plot 'val' (size) on the y-axis
add = "point", # Add individual data points to the boxplot
ylab = "Size", # Label for the y-axis
xlab = "" # No label for the x-axis
) +
scale_y_continuous(
limits = c(0, 25),
expand = c(0, 0)
) +
stat_pvalue_manual(pwc) + # Add p-values from pairwise t-tests
labs(
subtitle = get_test_label(res.aov, detailed = FALSE), # Subtitle for ANOVA result
caption = "2 by 2 testing: Student's t-test, p-value adjustment: Bonferroni" # Caption explaining test details
)
Switching to violins plots to add some info and some nice colors :
ggstatsplot::ggbetweenstats(ISLR::Wage, education, wage, "np") +
ylab("Workers raw wage") +
xlab("Education level")
This section is dedicated to regression models outputs and other associated graph.
Starting with the survival analysis, the most classic graph is the descending survival curve :
n <- 1000 # Number of individuals
df <- data.frame(
id = 1:n,
time = as.numeric(sample(1:365, n, replace = TRUE)), # Simulated follow-up time between 1 and 365 days
event = factor(sample(0:1, n, prob = c(0.4, 0.6), replace = TRUE)), # Event (0 = censored, 1 = sick, 2 = dead)
exposure = sample(c("Unexposed", "Exposed"), n, replace = TRUE) # Exposure status (Unexposed or Exposed)
)
df$time[df$event == 0] <- 365 # Set time to 365 for censored observations (event == 0)
fit <- survfit(Surv(time = time, event = event) ~ exposure, data = df) # Fitting a Survival Model
ymax <- 30
autoplot(fit, surv.linetype = 'blank', surv.colour = NA) +
geom_step(aes(linetype = strata, color = event), size = 0.8) +
scale_linetype_manual(name = "Group",
values = c('dotted', 'solid'),
labels = c('Non-exposed', 'Exposed')) +
scale_color_manual(name = "Event",
values = c("white", "#1C4073"),
labels = c(' ', "Hospitalised")) +
labs(x = "Période de suivi (jours)", y = "Proportion", title = "") +
geom_vline(aes(xintercept = 0), size = 1) +
geom_hline(aes(yintercept = 1-ymax/100), size = 1) +
scale_y_reverse(limits = 1-c(ymax/100, 1), breaks = 1-seq(ymax/100, 1, 0.1),
labels = paste0(seq(ymax, 100, 10), "%"), expand = c(0,0)) +
scale_x_continuous(breaks = seq(0, 360, 60), expand = c(0,0)) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(face="bold", colour="black", size = 12),
axis.title.y = element_text(face="bold", colour="black", size = 12),
legend.title = element_text(face="bold", size = 10),
panel.grid.major.y = element_line(colour = "lightgray", size = 0.3),
panel.grid.minor.x = element_blank(),
legend.position = c(0.15, 0.30),
legend.background = element_rect(fill = "transparent", color = "white"))
Let’s add some even and have an ascending component :
# Simulating survival data
n <- 1000 # Number of individuals
df <- data.frame(
id = 1:n,
time = as.numeric(sample(1:365, n, replace = TRUE)), # Simulated follow-up time between 1 and 365 days
event = factor(sample(0:2, n, prob = c(0.4, 0.5, 0.1), replace = TRUE)), # Event (0 = censored, 1 = sick, 2 = dead)
exposure = sample(c("Unexposed", "Exposed"), n, replace = TRUE) # Exposure status (Unexposed or Exposed)
)
df$time[df$event == 0] <- 365 # Set time to 365 for censored observations (event == 0)
fit <- survfit(Surv(time = time, event = event) ~ exposure, data = df) # Fitting a Survival Model
# Plot a survival curve with `autoplot` and enhance it with additional layers
autoplot(fit, surv.linetype = 'blank', surv.colour = NA) +
# Add step lines representing survival curves
geom_step(
aes(linetype = strata, color = event), # Map line type to `strata` and color to `event`
size = 0.8 # Set line thickness
) +
# Customize the line types for survival curves
scale_linetype_manual(
name = "Group", # Title for the legend
values = c('dashed', 'solid'), # Define line types: dashed for unexposed, solid for exposed
labels = c('Unexposed', 'Exposed') # Labels for the legend
) +
# Customize the colors for the event states
scale_color_manual(
name = "State", # Title for the legend
values = c("#AAC0AF", "#1C4073", "#B28B84"), # Colors for Healthy, Sick, and Dead states
labels = c('Healthy', 'Sick', 'Dead') # Labels for the legend
) +
# Add axis labels and a title
labs(
x = "Followup period (days)", # X-axis label
y = "Proportion of the population", # Y-axis label
title = "" # Title (empty for now)
) +
# Add a vertical reference line at x = 0
geom_vline(
aes(xintercept = 0), # Position of the line
size = 1 # Thickness of the line
) +
# Customize the y-axis scale
scale_y_continuous(
limits = c(0, 1), # Set the y-axis range from 0 to 1
breaks = seq(0, 1, 0.1), # Major ticks every 0.1
labels = paste0(seq(0, 100, 10), "%"), # Convert tick labels to percentages
expand = c(0, 0) # Remove padding on the y-axis
) +
# Customize the x-axis scale
scale_x_continuous(
breaks = seq(0, 360, 60), # Major ticks every 60 days
expand = c(0, 0) # Remove padding on the x-axis
) +
# Apply a minimal theme for a clean appearance
theme_minimal() +
# Customize specific theme elements
theme(
plot.title = element_text(hjust = 0.5), # Center-align the plot title
axis.title.x = element_text(
face = "bold", # Make x-axis title bold
colour = "black", # Set title color to black
size = 12 # Set font size
),
axis.title.y = element_text(
face = "bold", # Make y-axis title bold
colour = "black", # Set title color to black
size = 12 # Set font size
),
legend.title = element_text(face = "bold", size = 10), # Style legend titles
panel.grid.major.y = element_line(
colour = "lightgray", # Light gray major grid lines on the y-axis
size = 0.3 # Set line thickness
),
panel.grid.minor.x = element_blank() # Remove minor grid lines on the x-axis
)